{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import scanpy as sc\n",
    "import anndata\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "import gc\n",
    "data_type = 'float32'\n",
    "\n",
    "# this line forces theano to use the GPU and should go before importing cell2location\n",
    "os.environ[\"THEANO_FLAGS\"] = 'device=cuda,floatX=' + data_type + ',force_device=True'\n",
    "\n",
    "import cell2location\n",
    "\n",
    "import matplotlib as mpl\n",
    "from matplotlib import rcParams\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set paths to data and results used through the document:\n",
    "sp_data_folder = '/net/data.isilon/ag-saez/bq_shared/scellMI/raw_visium/'\n",
    "results_folder = '/net/data.isilon/ag-saez/bq_rramirez/visiumMI_revisions/results/deconvolution/c2l/'\n",
    "regression_model_output = 'rmodelRegressionGeneBackgroundCoverageTorch_26covariates_60574cells_12394genestestMI'\n",
    "reg_path = f'{results_folder}{regression_model_output}/'\n",
    "sample_names = [\"157771\", \"157772\", \"157775\", \"157777\", \"157779\", \"157781\", \"157782\", \"157785\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reading and analyzing visium data\n",
    "def read_and_qc(sample_name, path=sp_data_folder + 'rawdata/'):\n",
    "    r\"\"\" This function reads the data for one 10X spatial experiment into the anndata object.\n",
    "    It also calculates QC metrics. Modify this function if required by your workflow.\n",
    "\n",
    "    :param sample_name: Name of the sample\n",
    "    :param path: path to data\n",
    "    \"\"\"\n",
    "\n",
    "    adata = sc.read_visium(path + str(sample_name),\n",
    "                           count_file='filtered_feature_bc_matrix.h5', load_images=True)\n",
    "    adata.obs['sample'] = sample_name\n",
    "    adata.var['SYMBOL'] = adata.var_names\n",
    "    #adata.var.rename(columns={'gene_ids': 'ENSEMBL'}, inplace=True)\n",
    "    #adata.var_names = adata.var['ENSEMBL']\n",
    "    #adata.var.drop(columns='ENSEMBL', inplace=True)\n",
    "\n",
    "    # Calculate QC metrics\n",
    "    sc.pp.calculate_qc_metrics(adata, inplace=True)\n",
    "    adata.var['mt'] = [gene.startswith('MT-') for gene in adata.var['SYMBOL']]\n",
    "    adata.obs['mt_frac'] = adata[:, adata.var['mt'].tolist()].X.sum(1).A.squeeze()/adata.obs['total_counts']\n",
    "\n",
    "    # add sample name to obs names\n",
    "    adata.obs[\"sample\"] = [str(i) for i in adata.obs['sample']]\n",
    "    adata.obs_names = adata.obs[\"sample\"] \\\n",
    "                          + '_' + adata.obs_names\n",
    "    adata.obs.index.name = 'spot_id'\n",
    "\n",
    "    return adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the data into anndata objects\n",
    "slides = []\n",
    "for i in sample_names:\n",
    "    slides.append(read_and_qc(i, path=sp_data_folder))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Combine anndata objects together\n",
    "adata = slides[0].concatenate(\n",
    "    slides[1:],\n",
    "    batch_key=\"sample\",\n",
    "    uns_merge=\"unique\",\n",
    "    batch_categories=sample_names,\n",
    "    index_unique=None\n",
    ")\n",
    "\n",
    "adata.var_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# mitochondria-encoded (MT) genes should be removed for spatial mapping\n",
    "adata.obsm['mt'] = adata[:, adata.var['mt'].values].X.toarray()\n",
    "adata = adata[:, ~adata.var['mt'].values]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_vis = adata.copy()\n",
    "adata_vis.raw = adata_vis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## snRNAseq reference (raw counts)\n",
    "adata_snrna_raw = sc.read(f'{reg_path}sc.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column name containing cell type annotations\n",
    "covariate_col_names = 'deconv_col'\n",
    "\n",
    "# Extract a pd.DataFrame with signatures from anndata object\n",
    "inf_aver = adata_snrna_raw.raw.var.copy()\n",
    "inf_aver = inf_aver.loc[:, [f'means_cov_effect_{covariate_col_names}_{i}' for i in adata_snrna_raw.obs[covariate_col_names].unique()]]\n",
    "from re import sub\n",
    "inf_aver.columns = [sub(f'means_cov_effect_{covariate_col_names}_{i}', '', i) for i in adata_snrna_raw.obs[covariate_col_names].unique()]\n",
    "inf_aver = inf_aver.iloc[:, inf_aver.columns.argsort()]\n",
    "\n",
    "# normalise by average experiment scaling factor (corrects for sequencing depth)\n",
    "inf_aver = inf_aver * adata_snrna_raw.uns['regression_mod']['post_sample_means']['sample_scaling'].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# selecting most informative genes based on specificity\n",
    "selection_specificity = 0.2\n",
    "\n",
    "# normalise expression signatures:\n",
    "cell_state_df_norm = (inf_aver.T / inf_aver.sum(1)).T\n",
    "# apply cut off:\n",
    "cell_state_df_norm = (cell_state_df_norm > selection_specificity)\n",
    "\n",
    "# check the number of markers per cell type\n",
    "cell_state_df_norm.sum(0), (cell_state_df_norm.sum(1) > 0).sum(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# now we don't need to keep the scRNA-seq data set and a list with slides in memory\n",
    "del adata_snrna_raw, slides\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.settings.set_figure_params(dpi = 100, color_map = 'viridis', dpi_save = 100,\n",
    "                              vector_friendly = True, format = 'pdf',\n",
    "                              facecolor='white')\n",
    "\n",
    "r = cell2location.run_cell2location(\n",
    "\n",
    "      # Single cell reference signatures as pd.DataFrame\n",
    "      # (could also be data as anndata object for estimating signatures\n",
    "      #  as cluster average expression - `sc_data=adata_snrna_raw`)\n",
    "      sc_data=inf_aver,\n",
    "      # Spatial data as anndata object\n",
    "      sp_data=adata_vis,\n",
    "\n",
    "      # the column in sc_data.obs that gives cluster idenitity of each cell\n",
    "      summ_sc_data_args={'cluster_col': \"deconv_col\",\n",
    "                         # select marker genes of cell types by specificity of their expression signatures\n",
    "                         'selection': \"cluster_specificity\",\n",
    "                         # specificity cutoff (1 = max, 0 = min)\n",
    "                         'selection_specificity': 0.2\n",
    "                        },\n",
    "\n",
    "      train_args={'use_raw': True, # By default uses raw slots in both of the input datasets.\n",
    "                  'n_iter': 40000, # Increase the number of iterations if needed (see QC below)\n",
    "\n",
    "                  # Whe analysing the data that contains multiple experiments,\n",
    "                  # cell2location automatically enters the mode which pools information across experiments\n",
    "                  'sample_name_col': 'sample'}, # Column in sp_data.obs with experiment ID (see above)\n",
    "\n",
    "\n",
    "      export_args={'path': results_folder, # path where to save results\n",
    "                   'run_name_suffix': '' # optinal suffix to modify the name the run\n",
    "                  },\n",
    "\n",
    "      model_kwargs={ # Prior on the number of cells, cell types and co-located groups\n",
    "\n",
    "                    'cell_number_prior': {\n",
    "                        # - N - the expected number of cells per location:\n",
    "                        'cells_per_spot': 8,\n",
    "                        # - A - the expected number of cell types per location:\n",
    "                        'factors_per_spot': 4,\n",
    "                        # - Y - the expected number of co-located cell type groups per location\n",
    "                        'combs_per_spot': 3\n",
    "                    },\n",
    "\n",
    "                     # Prior beliefs on the sensitivity of spatial technology:\n",
    "                    'gene_level_prior':{\n",
    "                        # Prior on the mean\n",
    "                        'mean': 1/2,\n",
    "                        # Prior on standard deviation,\n",
    "                        # a good choice of this value should be at least 2 times lower that the mean\n",
    "                        'sd': 1/4\n",
    "                    }\n",
    "      }\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
